Preprocessing + Statistical Modeling

Dimension Reduction

Clustering
Clustering
Dimensions
Dimensions
# load in the neccessary data , extract important information 

library(devtools)
## Loading required package: usethis
library(Biobase)
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
load(file=con)
close(con)
mp = montpick.eset
pdata=pData(mp)
edata=as.data.frame(exprs(mp))
fdata = fData(mp)
ls()
## [1] "con"           "edata"         "fdata"         "montpick.eset"
## [5] "mp"            "pdata"

Plotting the singular values

edata = log2(edata + 1) # normalize the data 
edata_centered = edata - rowMeans(edata) # center the data (eliminates mean as the source of variation)
svd1 = svd(edata_centered) # single value decomposition makes d, u, v 

# plot the d vector 

plot(svd1$d,ylab="Singular value",col=2) # plot the d values 

# variance explained 
plot(svd1$d^2/sum(svd1$d^2),ylab="Percent Variance Explained",col=2)

Singular Components and Principle Comps:

# plot the first and second singular vectors (not really apparently) (v first and second columns)

par(mfrow=c(1,2))
plot(svd1$v[,1],col=2,ylab="1st PC")
plot(svd1$v[,2],col=3,ylab="2nd PC")

# plot them against each other, colors are for which study they come from 
plot(svd1$v[,1],svd1$v[,2],ylab="2nd PC",xlab="1st PC", col = c(4,6))
# boxplot of the first principle in both studies 
par(mfrow=c(1,1))

boxplot(svd1$v[,1] ~ pdata$study,border=c(1,2))
points(svd1$v[,1] ~ jitter(as.numeric(pdata$study)),col=as.numeric(pdata$study))

# PCs versus SVs 
pc1 = prcomp(edata)
    # first pc vs first singular vector 
plot(pc1$rotation[,1],svd1$v[,1])

# if you scale PCs and SVs the same way (by column), PV = SV
edata_centered2 = t(t(edata) - colMeans(edata))
svd2 = svd(edata_centered2)
plot(pc1$rotation[,1],svd2$v[,1],col=2)

The Outlier Dataset

edata_outlier = edata_centered
# make a very outlying gene
edata_outlier[1,] = edata_centered[1,] * 10000
# get the vectors for outlying dataset 
svd3 = svd(edata_outlier)

# compare the singular vectors with outlier, without outlier
par(mfrow=c(1,2))
plot(svd1$v[,1],col=1,main="Without outlier") # normal scatter
plot(svd3$v[,1],col=2,main="With outlier") # weird behavior 

plot(svd3$v[,1],edata_outlier[1,],col=4) # very linear 

Preprocessing + Normalization

Quantile Normalization (in R)

library(devtools)
library(Biobase)
library(preprocessCore)

con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
load(file=con)
close(con)
mp = montpick.eset
pdata=pData(mp)
edata=as.data.frame(exprs(mp))
fdata = fData(mp)
ls()
##  [1] "con"             "edata"           "edata_centered"  "edata_centered2"
##  [5] "edata_outlier"   "fdata"           "montpick.eset"   "mp"             
##  [9] "pc1"             "pdata"           "svd1"            "svd2"           
## [13] "svd3"
dim(edata) # wrong dimensions lol 
## [1] 52580   129
edata = log2(edata + 1)
edata = edata[rowMeans(edata) > 3, ]
colramp = colorRampPalette(c(3,"white",2))(20)
plot(density(edata[,1]),col=colramp[1],lwd=3,ylim=c(0,.30))
for(i in 2:20){lines(density(edata[,i]),lwd=3,col=colramp[i])}

# actually quantile normalize 
norm_edata = normalize.quantiles(as.matrix(edata))
plot(density(norm_edata[,1]),col=colramp[1],lwd=3,ylim=c(0,.20))
for(i in 2:20){lines(density(norm_edata[,i]),lwd=3,col=colramp[i])} # the density plot shows that all the samples are now lined up (for the most part)

# can still have batch effects in the data set though 

Linear Models

Linear Models w/ Categorical Covariates

Adjusting for covariates

Linear Regression in R

con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData")
load(file=con)
close(con)
bm = bodymap.eset
pdata=pData(bm)
edata=as.data.frame(exprs(bm))
fdata = fData(bm)
ls()
##  [1] "bm"              "bodymap.eset"    "colramp"         "con"            
##  [5] "edata"           "edata_centered"  "edata_centered2" "edata_outlier"  
##  [9] "fdata"           "i"               "montpick.eset"   "mp"             
## [13] "norm_edata"      "pc1"             "pdata"           "svd1"           
## [17] "svd2"            "svd3"
library(devtools)
library(Biobase)
library(broom)

# fitting linear model
edata = as.matrix(edata)
#     lm( outcome( all genes, one sample), predictor (age))
lm1 = lm(edata[1,] ~ pdata$age)
tidy(lm1)
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)   2188.     403.        5.43 0.0000888
## 2 pdata$age      -23.2      6.39     -3.64 0.00269
# plotting line
plot(edata[1,] ~ pdata$age, main = "Regression Line")
abline(lm1)

# for a categorical predictor like gender 

# relationship between gender and gene's expression 
boxplot(edata[1,] ~ pdata$gender)
points(edata[1,] ~ jitter(as.numeric(pdata$gender)),
       col=as.numeric(pdata$gender))

# r dummy variables stuff for you on its own 
lm2 = lm(edata[1,] ~ pdata$gender)
tidy(lm2)
## # A tibble: 2 × 5
##   term          estimate std.error statistic p.value
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)       837       229.     3.66  0.00258
## 2 pdata$genderM    -106.      324.    -0.326 0.749
mod_matr = model.matrix(lm2)
mod_matr # undr the hood, each gender is assigned a number
##           (Intercept) pdata$genderM
## ERS025098           1             0
## ERS025092           1             1
## ERS025085           1             0
## ERS025088           1             0
## ERS025089           1             0
## ERS025082           1             1
## ERS025081           1             0
## ERS025096           1             1
## ERS025099           1             1
## ERS025086           1             0
## ERS025083           1             0
## ERS025095           1             1
## ERS025097           1             1
## ERS025094           1             1
## ERS025090           1             0
## ERS025091           1             1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$`pdata$gender`
## [1] "contr.treatment"
# somethign with more levels 
tidy(lm(edata[1,] ~ pdata$tissue.type)) # the intercept is the missing one (adipose)
## # A tibble: 17 × 5
##    term                              estimate std.error statistic p.value
##    <chr>                                <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept)                         1354.      1018.    1.33    0.315 
##  2 pdata$tissue.typeadrenal           -1138.      1440.   -0.790   0.512 
##  3 pdata$tissue.typebrain             -1139.      1440.   -0.791   0.512 
##  4 pdata$tissue.typebreast             -430.      1440.   -0.299   0.793 
##  5 pdata$tissue.typecolon              -629.      1440.   -0.437   0.705 
##  6 pdata$tissue.typeheart             -1229       1440.   -0.854   0.483 
##  7 pdata$tissue.typekidney             -558.      1440.   -0.388   0.736 
##  8 pdata$tissue.typeliver               600.      1440.    0.417   0.717 
##  9 pdata$tissue.typelung               -539.      1440.   -0.374   0.744 
## 10 pdata$tissue.typelymphnode         -1105       1440.   -0.768   0.523 
## 11 pdata$tissue.typemixture            4043.      1175.    3.44    0.0751
## 12 pdata$tissue.typeovary                96.0     1440.    0.0667  0.953 
## 13 pdata$tissue.typeprostate           -573.      1440.   -0.398   0.729 
## 14 pdata$tissue.typeskeletal_muscle   -1285.      1440.   -0.893   0.466 
## 15 pdata$tissue.typetestes              534.      1440.    0.371   0.746 
## 16 pdata$tissue.typethyroid            -371.      1440.   -0.258   0.821 
## 17 pdata$tissue.typewhite_blood_cell  -1351.      1440.   -0.938   0.447

Interaction Models

lm3 = lm(edata[1,] ~ pdata$age * pdata$gender)
tidy(lm3) # term for age , gender and age interacting with gender 
## # A tibble: 4 × 5
##   term                    estimate std.error statistic p.value
##   <chr>                      <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)               1679.     610.        2.75  0.0175
## 2 pdata$age                  -13.5      9.43     -1.43  0.178 
## 3 pdata$genderM              913.     793.        1.15  0.272 
## 4 pdata$age:pdata$genderM    -18.5     12.5      -1.47  0.167
# more than one model
lm4 = lm(edata[1,] ~ pdata$age + pdata$gender)
tidy(lm4)
## # A tibble: 3 × 5
##   term          estimate std.error statistic  p.value
##   <chr>            <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     2332.     438.       5.32  0.000139
## 2 pdata$age        -23.9      6.49    -3.69  0.00274 
## 3 pdata$genderM   -207.     236.      -0.877 0.397
# always check for outliers 
lm5 = lm(edata[6,] ~ pdata$age)
plot(edata[6,] ~ pdata$age)
abline(lm5) # olutlier not effecting the line thankfully

plot(lm5) # identify the outliers from labeled qqplot

Multiple Regression

con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData")
load(file=con)
close(con)
bot = bottomly.eset
pdata=pData(bot)
edata=as.matrix(exprs(bot))
fdata = fData(bot)
ls()
##  [1] "bm"              "bodymap.eset"    "bot"             "bottomly.eset"  
##  [5] "colramp"         "con"             "edata"           "edata_centered" 
##  [9] "edata_centered2" "edata_outlier"   "fdata"           "i"              
## [13] "lm1"             "lm2"             "lm3"             "lm4"            
## [17] "lm5"             "mod_matr"        "montpick.eset"   "mp"             
## [21] "norm_edata"      "pc1"             "pdata"           "svd1"           
## [25] "svd2"            "svd3"
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(edge)  

edata = log2(as.matrix(edata) + 1)
edata = edata[rowMeans(edata) > 10, ] # remove low expressions
dim(edata) # gonna end up with 36k models ???
## [1] 1049   21
# build model matrix 
mod = model.matrix(~ pdata$strain)
View(mod) # dummy encodes the strain 
#           (predictor, response)
fit = lm.fit(mod, t(edata)) # fits a model where strain predicts each gene.
length(fit$coefficients) # one model per gene/ feature where strain is the predictor 
## [1] 2098
tidy(lm(as.numeric(edata[1, ]) ~ pdata$strain)) # shows you how strain predicts the first gene/feature 
## # A tibble: 2 × 5
##   term               estimate std.error statistic  p.value
##   <chr>                 <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)          10.4       0.152     68.7  3.10e-24
## 2 pdata$strainDBA/2J    0.348     0.210      1.66 1.13e- 1

Batch Effects / Confounders